In [1]:
import os
import shutil
import subprocess
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from keras import layers
from colorama import Fore, Style
from IPython.core.display import HTML
2024-04-08 10:14:27.321908: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-08 10:14:31.189267: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [2]:
# NOTE(review): hardcoded absolute Azure ML path — consider reading this from
# an environment variable or a config cell so the notebook runs elsewhere.
easy_dataset_path = Path("/home/azureuser/cloudfiles/code/Users/NAZAR.KHOKHLA.2023/sentences.csv")
In [3]:
# Load the English/Turkish sentence pairs and shuffle all rows up front
# (sample(frac=1) returns a full random permutation of the frame).
easy_dataset = pd.read_csv(easy_dataset_path, encoding="utf-8", engine="pyarrow")
easy_dataset = easy_dataset.sample(frac=1, random_state=42)
easy_dataset.head()
Out[3]:
English The Other Language
93238 A cold snap is expected this week. Bu hafta bir soğuk hava dalgası bekleniyor.
118815 Tom was married to Mary for three years. Tom üç yıldır Mary ile evliydi.
4568 We all know you. Biz hepimiz seni tanıyoruz.
93723 Have you ever eaten monkey brains? Hiç maymun beyni yedin mi?
49845 I bought half a dozen eggs. Yarım düzine yumurta aldım.
In [4]:
easy_dataset.info()
<class 'pandas.core.frame.DataFrame'>
Index: 145105 entries, 93238 to 121958
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   English             145105 non-null  object
 1   The Other Language  145105 non-null  object
dtypes: object(2)
memory usage: 3.3+ MB
In [5]:
# Sentence-length features: number of whitespace-separated tokens per sentence.
easy_dataset["English Words in Sentence"] = easy_dataset["English"].str.split().str.len()
easy_dataset["Turkish Words in Sentence"] = (
    easy_dataset["The Other Language"].str.split().str.len()
)

# Grouped histogram with marginal box plots comparing sentence lengths in
# the two languages.
fig = px.histogram(
    easy_dataset,
    x=["English Words in Sentence", "Turkish Words in Sentence"],
    title="Easy Dataset - Words in Sentence",
    labels={"variable": "Variable", "value": "Words in Sentence"},
    color_discrete_sequence=["#3f384a", "#e04c5f"],
    barmode="group",
    marginal="box",
    width=840,
    height=540,
)
fig.update_layout(
    yaxis_title="Count",
    font_color="#141B4D",
    title_font_size=18,
    plot_bgcolor="#F6F5F5",
    paper_bgcolor="#F6F5F5",
    bargap=0.2,
    bargroupgap=0.1,
    legend=dict(orientation="h", yanchor="bottom", xanchor="right", y=1.02, x=1),
)
fig.show()
In [6]:
sentences_en = easy_dataset["English"].to_numpy()
sentences_tr = easy_dataset["The Other Language"].to_numpy()

# Hold out the last 10% of the (already shuffled) rows for validation.
valid_fraction = 0.1
valid_len = int(valid_fraction * len(easy_dataset))

# Split at an explicit index instead of negative slicing: `arr[:-valid_len]`
# silently returns an EMPTY array when valid_len == 0 (tiny datasets or
# valid_fraction = 0), which would discard all training data.
split_idx = len(easy_dataset) - valid_len

sentences_en_train = sentences_en[:split_idx]
sentences_tr_train = sentences_tr[:split_idx]

sentences_en_valid = sentences_en[split_idx:]
sentences_tr_valid = sentences_tr[split_idx:]
In [7]:
def prepare_input_and_target(sentences_en, sentences_tr):
    """Return data in the format: `((encoder_input, decoder_input), target)`.

    The decoder input is the target sentence prefixed with the
    start-of-sequence marker, while the training target is the same
    sentence suffixed with the end-of-sequence marker (teacher forcing).
    """
    decoder_input = b"startofseq " + sentences_tr
    target = sentences_tr + b" endofseq"
    return (sentences_en, decoder_input), target


def from_sentences_dataset(
    sentences_en,
    sentences_tr,
    batch_size=32,
    cache=True,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=None,
):
    """Creates `TensorFlow` dataset for encoder-decoder RNN from given sentences.

    Each element is `((encoder_input, decoder_input), target)` as produced by
    `prepare_input_and_target`. Caching happens before shuffling so each epoch
    gets a fresh shuffle of the cached elements.
    """
    ds = tf.data.Dataset.from_tensor_slices((sentences_en, sentences_tr)).map(
        prepare_input_and_target, num_parallel_calls=tf.data.AUTOTUNE
    )
    if cache:
        ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(shuffle_buffer_size, seed=seed)
    return ds.batch(batch_size)
In [8]:
# Measure raw input-pipeline throughput (no model involved).
benchmark_ds = from_sentences_dataset(sentences_en_train, sentences_tr_train).prefetch(
    tf.data.AUTOTUNE
)
bench_results = tfds.benchmark(benchmark_ds, batch_size=32)
************ Summary ************

Examples/sec (First included) 40614.10 ex/sec (total: 130656 ex, 3.22 sec)
Examples/sec (First only) 270.53 ex/sec (total: 32 ex, 0.12 sec)
Examples/sec (First excluded) 42154.14 ex/sec (total: 130624 ex, 3.10 sec)
100%|██████████| 4082/4082 [00:03<00:00, 1312.97it/s]
In [9]:
# Peek at one small batch to inspect the ((encoder, decoder), target) format.
example_ds = from_sentences_dataset(sentences_en_train, sentences_tr_train, batch_size=4)
next(iter(example_ds.take(1)))
2024-04-08 10:14:49.560787: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
Out[9]:
((<tf.Tensor: shape=(4,), dtype=string, numpy=
  array([b'A cold snap is expected this week.',
         b'Tom was married to Mary for three years.', b'We all know you.',
         b'Have you ever eaten monkey brains?'], dtype=object)>,
  <tf.Tensor: shape=(4,), dtype=string, numpy=
  array([b'startofseq Bu hafta bir so\xc4\x9fuk hava dalgas\xc4\xb1 bekleniyor.',
         b'startofseq Tom \xc3\xbc\xc3\xa7 y\xc4\xb1ld\xc4\xb1r Mary ile evliydi.',
         b'startofseq Biz hepimiz seni tan\xc4\xb1yoruz.',
         b'startofseq Hi\xc3\xa7 maymun beyni yedin mi?'], dtype=object)>),
 <tf.Tensor: shape=(4,), dtype=string, numpy=
 array([b'Bu hafta bir so\xc4\x9fuk hava dalgas\xc4\xb1 bekleniyor. endofseq',
        b'Tom \xc3\xbc\xc3\xa7 y\xc4\xb1ld\xc4\xb1r Mary ile evliydi. endofseq',
        b'Biz hepimiz seni tan\xc4\xb1yoruz. endofseq',
        b'Hi\xc3\xa7 maymun beyni yedin mi? endofseq'], dtype=object)>)
In [10]:
# cardinality() is cheap metadata — the number of batches this dataset
# yields per epoch (training sentences / batch_size of 4 here).
example_ds.cardinality()  # Number of batches per epoch.
Out[10]:
<tf.Tensor: shape=(), dtype=int64, numpy=32649>
In [11]:
# Color codes for the custom training progress lines below.
CLR = Style.BRIGHT + Fore.WHITE
RED = Style.BRIGHT + Fore.RED


class ColoramaVerbose(keras.callbacks.Callback):
    """Keras callback that prints a compact, colorized one-line epoch summary."""

    def on_epoch_end(self, epoch, logs=None):
        # `logs` may be None, and the val_* keys are absent when validation
        # is skipped — default to NaN instead of raising KeyError mid-training.
        logs = logs or {}
        nan = float("nan")
        print(
            f"{CLR}Epoch: {RED}{epoch + 1:02d}{CLR} -",
            f"{CLR}loss: {RED}{logs.get('loss', nan):.5f}{CLR} -",
            f"{CLR}accuracy: {RED}{logs.get('accuracy', nan):.5f}{CLR} -",
            f"{CLR}val_loss: {RED}{logs.get('val_loss', nan):.5f}{CLR} -",
            f"{CLR}val_accuracy: {RED}{logs.get('val_accuracy', nan):.5f}",
        )
In [12]:
def adapt_compile_and_fit(
    model,
    train_dataset,
    valid_dataset,
    n_epochs=25,
    n_patience=5,
    init_lr=0.001,
    lr_decay_rate=0.1,
    colorama_verbose=False,
):
    """Takes the model vectorization layers and adapts them to the training data.
    Then, it prepares the final datasets vectorizing targets and prefetching,
    and finally trains the given model. Additionally, provides learning rate
    scheduling (exponential decay), early stopping and colorama verbose.

    Returns the `keras.callbacks.History` object from `model.fit`.
    """

    model.vectorization_en.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[0],  # English sentences.
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )
    # Decoder inputs already carry "startofseq "; appending " endofseq" makes
    # sure both sequence markers end up in the Turkish vocabulary.
    model.vectorization_tr.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[1] + b" endofseq",  # Turkish sentences.
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )

    train_dataset_prepared = train_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_tr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)

    valid_dataset_prepared = valid_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_tr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)

    early_stopping_cb = keras.callbacks.EarlyStopping(
        monitor="val_accuracy", patience=n_patience, restore_best_weights=True
    )

    # Prefer the cheap cardinality() metadata; only fall back to iterating the
    # whole dataset (the previous, always-on approach) when cardinality is
    # unknown, e.g. with multi-file interleaving. Note: an infinite dataset
    # cannot be counted either way.
    n_batches = int(train_dataset_prepared.cardinality().numpy())
    if n_batches < 0:  # tf.data.UNKNOWN_CARDINALITY / INFINITE_CARDINALITY.
        n_batches = sum(1 for _ in train_dataset_prepared)
    n_decay_steps = n_epochs * n_batches
    scheduled_lr = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=init_lr,
        decay_steps=n_decay_steps,
        decay_rate=lr_decay_rate,
    )

    model_callbacks = [early_stopping_cb]
    verbose_level = 1
    if colorama_verbose:
        model_callbacks.append(ColoramaVerbose())
        verbose_level = 0  # Suppress the default bar; the callback prints instead.

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.RMSprop(learning_rate=scheduled_lr),
        metrics=["accuracy"],
    )

    return model.fit(
        train_dataset_prepared,
        epochs=n_epochs,
        validation_data=valid_dataset_prepared,
        callbacks=model_callbacks,
        verbose=verbose_level,
    )
In [13]:
def translate(model, sentence_en):
    """Greedy-decodes an English sentence into Turkish, one token at a time.

    The growing partial translation is fed back as the decoder input on each
    step; decoding stops at the "endofseq" token or after
    `model.max_sentence_len` words.
    """
    words = []
    for word_idx in range(model.max_sentence_len):
        X_encoder = np.array([sentence_en])
        # " ".join([""] + words) rebuilds the running translation with its
        # leading space, matching the original string accumulation exactly.
        X_decoder = np.array(["startofseq " + " ".join([""] + words)])
        # Probabilities for the token at position `word_idx`.
        y_proba = model.predict((X_encoder, X_decoder), verbose=0)[0, word_idx]
        predicted_word = model.vectorization_tr.get_vocabulary()[np.argmax(y_proba)]
        if predicted_word == "endofseq":
            break
        words.append(predicted_word)
    return " ".join(words)
In [14]:
class BidirectionalEncoderDecoderWithAttention(keras.Model):
    """Translation model: a bidirectional LSTM encoder, an LSTM decoder
    initialized from the encoder's final states, and dot-product attention
    from decoder steps over the encoder's output sequence.

    Takes raw strings as input — the `TextVectorization` layers are part of
    the model and must be `adapt()`-ed before training (see
    `adapt_compile_and_fit`).
    """

    def __init__(
        self,
        vocabulary_size=5000,
        max_sentence_len=50,
        embedding_size=256,
        n_units_lstm=512,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len

        # String -> padded token-id sequences (vocabulary learned via adapt()).
        self.vectorization_en = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.vectorization_tr = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )

        # mask_zero=True propagates a padding mask to downstream layers.
        self.encoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )

        # Half-size LSTM per direction, so the concatenated bidirectional
        # states match the decoder's n_units_lstm width.
        self.encoder = layers.Bidirectional(
            layers.LSTM(n_units_lstm // 2, return_sequences=True, return_state=True)
        )
        self.decoder = layers.LSTM(n_units_lstm, return_sequences=True)
        self.attention = layers.Attention()
        self.output_layer = layers.Dense(vocabulary_size, activation="softmax")

    def call(self, inputs):
        # inputs: (encoder_strings, decoder_strings) — decoder strings carry
        # the "startofseq" prefix (see prepare_input_and_target).
        encoder_inputs, decoder_inputs = inputs

        encoder_input_ids = self.vectorization_en(encoder_inputs)
        decoder_input_ids = self.vectorization_tr(decoder_inputs)

        encoder_embeddings = self.encoder_embedding(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding(decoder_input_ids)

        # The final hidden state of the encoder, representing the entire
        # input sequence, is used to initialize the decoder.
        # The bidirectional LSTM yields four state tensors (two per
        # direction); the strided concat below merges them into the
        # [hidden, cell] pair the unidirectional decoder expects.
        encoder_output, *encoder_state = self.encoder(encoder_embeddings)
        encoder_state = [
            tf.concat(encoder_state[0::2], axis=-1),  # Short-term state (0 & 2).
            tf.concat(encoder_state[1::2], axis=-1),  # Long-term state (1 & 3).
        ]
        decoder_output = self.decoder(decoder_embeddings, initial_state=encoder_state)
        # Attention: decoder outputs are the queries; encoder outputs serve
        # as keys and values.
        attention_output = self.attention([decoder_output, encoder_output])

        return self.output_layer(attention_output)
In [15]:
# Fresh Keras state + fixed seed so the training run is reproducible.
keras.backend.clear_session()  # Resets all state generated by Keras.
tf.random.set_seed(42)  # Ensure reproducibility on CPU.

# Training pipeline is shuffled each epoch; validation stays in order.
easy_train_ds = from_sentences_dataset(
    sentences_en_train, sentences_tr_train, shuffle=True, seed=42
)
easy_valid_ds = from_sentences_dataset(sentences_en_valid, sentences_tr_valid)

bidirect_encoder_decoder = BidirectionalEncoderDecoderWithAttention(
    max_sentence_len=15
)
bidirect_history = adapt_compile_and_fit(
    bidirect_encoder_decoder,
    easy_train_ds,
    easy_valid_ds,
    init_lr=0.01,
    lr_decay_rate=0.01,
    colorama_verbose=True,
)
2024-04-08 10:15:07.435782: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2593 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 37486592 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2024-04-08 10:25:28.540718: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:693] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "101" frequency: 2593 num_cores: 4 environment { key: "cpu_instruction_set" value: "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 1048576 l3_cache_size: 37486592 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
Epoch: 01 - loss: 3.32274 - accuracy: 0.43764 - val_loss: 2.61784 - val_accuracy: 0.50026
Epoch: 02 - loss: 2.35497 - accuracy: 0.52489 - val_loss: 2.29117 - val_accuracy: 0.53278
Epoch: 03 - loss: 2.00593 - accuracy: 0.56653 - val_loss: 2.12529 - val_accuracy: 0.55982
Epoch: 04 - loss: 1.76258 - accuracy: 0.60130 - val_loss: 2.01952 - val_accuracy: 0.57956
Epoch: 05 - loss: 1.56285 - accuracy: 0.63339 - val_loss: 1.96088 - val_accuracy: 0.59106
Epoch: 06 - loss: 1.38447 - accuracy: 0.66480 - val_loss: 1.91254 - val_accuracy: 0.60183
Epoch: 07 - loss: 1.22390 - accuracy: 0.69614 - val_loss: 1.88595 - val_accuracy: 0.61239
Epoch: 08 - loss: 1.07653 - accuracy: 0.72675 - val_loss: 1.88504 - val_accuracy: 0.61764
Epoch: 09 - loss: 0.94293 - accuracy: 0.75717 - val_loss: 1.89759 - val_accuracy: 0.61966
Epoch: 10 - loss: 0.82661 - accuracy: 0.78585 - val_loss: 1.90609 - val_accuracy: 0.62380
Epoch: 11 - loss: 0.72516 - accuracy: 0.81215 - val_loss: 1.93893 - val_accuracy: 0.62450
Epoch: 12 - loss: 0.63992 - accuracy: 0.83526 - val_loss: 1.96640 - val_accuracy: 0.62589
Epoch: 13 - loss: 0.56886 - accuracy: 0.85613 - val_loss: 1.99304 - val_accuracy: 0.62742
Epoch: 14 - loss: 0.51006 - accuracy: 0.87297 - val_loss: 2.02849 - val_accuracy: 0.62658
Epoch: 15 - loss: 0.46252 - accuracy: 0.88672 - val_loss: 2.05606 - val_accuracy: 0.62650
In [18]:
# Per-epoch loss/accuracy curves for training and validation.
fig = px.line(
    bidirect_history.history,
    title="Easy Dataset - Encoder-Decoder RNN Training Process",
    labels={"variable": "Variable", "value": "Value", "index": "Epoch"},
    color_discrete_sequence=px.colors.diverging.balance_r,
    symbol="variable",
    markers=True,
    width=840,
    height=540,
)
fig.update_layout(
    plot_bgcolor="#F6F5F5",
    paper_bgcolor="#F6F5F5",
    font_color="#141B4D",
    title_font_size=18,
)
fig.show()
In [23]:
# Translate a few held-out style sentences and compare against references.
translation1 = translate(bidirect_encoder_decoder, "Tell Tom to take a seat.")
translation2 = translate(bidirect_encoder_decoder, "I wish Tom were dead.")
translation3 = translate(bidirect_encoder_decoder, "She ordered three dinners.")

# Reference (human) translations for side-by-side comparison.
print("Tell Tom to take a seat.     Tom'a oturmasını söyle.")
print("I wish Tom were dead.        Keşke Tom ölmüş olsa.")
print("She ordered three dinners.   O üç tane akşam yemeği sipariş etti.")
print()
print("Model Translations:")
for model_translation in (translation1, translation2, translation3):
    print(model_translation)
Tell Tom to take a seat.     Tom'a oturmasını söyle.
I wish Tom were dead.        Keşke Tom ölmüş olsa.
She ordered three dinners.   O üç tane akşam yemeği sipariş etti.

Model Translations:
toma [UNK] söyle
keşke tom ölmüş olsa
o [UNK] [UNK] etti
In [20]:
import mlflow
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
In [21]:
# Per-epoch training-loss values recorded by Keras during model.fit.
loss_history = bidirect_history.history["loss"]
In [26]:
# Assuming `train_dataset` and `bidirect_encoder_decoder` are available
# NOTE(review): this appears to repeat the adaptation already performed
# inside `adapt_compile_and_fit` on the same data — likely redundant after
# training; confirm before keeping.
model = bidirect_encoder_decoder
train_dataset = easy_train_ds

# Adapt the vectorization layers
model.vectorization_en.adapt(
    train_dataset.map(
        lambda sentences, target: sentences[0],  # English sentences.
        num_parallel_calls=tf.data.AUTOTUNE,
    )
)
In [27]:
# NOTE(review): re-adapts the Turkish vectorization layer exactly as
# `adapt_compile_and_fit` already did — appears redundant; confirm.
model.vectorization_tr.adapt(
    train_dataset.map(
        lambda sentences, target: sentences[1] + b" endofseq",  # Turkish sentences.
        num_parallel_calls=tf.data.AUTOTUNE,
    )
)
In [28]:
# Vectorize targets and prefetch, mirroring the pipeline built inside
# adapt_compile_and_fit. NOTE(review): the result appears unused in the
# visible cells below — candidate for removal.
train_dataset_prepared = train_dataset.map(
    lambda sentences, target: (sentences, model.vectorization_tr(target)),
    num_parallel_calls=tf.data.AUTOTUNE,
).prefetch(tf.data.AUTOTUNE)
In [ ]:
 
In [22]:
mlflow.set_experiment("Your-First-AI")

with mlflow.start_run(run_name='run1'):
    # Log the full loss curve with explicit step indices so MLflow records
    # a proper per-epoch series instead of repeated values at step 0.
    for epoch, loss in enumerate(loss_history):
        mlflow.log_metric("loss", loss, step=epoch)

    # mlflow.tensorflow requires EVERY signature field to be a TensorSpec:
    # wrap both the example input and output in numpy arrays (a plain Python
    # string produces a ColSpec field and raises MlflowException, as seen in
    # the traceback below).
    example_input = np.array(["Tell Tom to take a seat."])
    example_output = np.array(
        [translate(bidirect_encoder_decoder, "Tell Tom to take a seat.")]
    )
    signature = infer_signature(example_input, example_output)
    conda_env = _mlflow_conda_env(
        additional_conda_deps=None,
        additional_pip_deps=None,
        additional_conda_channels=None
    )
    # Log the model INSIDE the active run so the artifact lands in the same
    # run as the metrics (previously this ran after the `with` block, which
    # starts a brand-new, unrelated run).
    mlflow.tensorflow.log_model(
        bidirect_encoder_decoder,
        "bidirect_encoder_decoder",
        conda_env=conda_env,
        signature=signature,
    )
2024/04/08 14:37:53 INFO mlflow.tracking.fluent: Experiment with name 'Your-First-AI' does not exist. Creating a new experiment.
/anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/_distutils_hack/__init__.py:33: UserWarning:

Setuptools is replacing distutils.

---------------------------------------------------------------------------
MlflowException                           Traceback (most recent call last)
Cell In[22], line 12
      6     signature = infer_signature("Tell Tom to take a seat.", translate(bidirect_encoder_decoder, "Tell Tom to take a seat."))
      7     conda_env = _mlflow_conda_env(
      8         additional_conda_deps=None,
      9         additional_pip_deps=None,
     10         additional_conda_channels=None
     11     )
---> 12 mlflow.tensorflow.log_model(bidirect_encoder_decoder, "bidirect_encoder_decoder", conda_env=conda_env, signature=signature)

File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/mlflow/tensorflow/__init__.py:226, in log_model(model, artifact_path, custom_objects, conda_env, code_paths, signature, input_example, registered_model_name, await_registration_for, pip_requirements, extra_pip_requirements, saved_model_kwargs, keras_model_kwargs, metadata)
    128 @format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
    129 def log_model(
    130     model,
   (...)
    143     metadata=None,
    144 ):
    145     """
    146     Log a TF2 core model (inheriting tf.Module) or a Keras model in MLflow Model format.
    147 
   (...)
    223              metadata of the logged model.
    224     """
--> 226     return Model.log(
    227         artifact_path=artifact_path,
    228         flavor=mlflow.tensorflow,
    229         model=model,
    230         conda_env=conda_env,
    231         code_paths=code_paths,
    232         custom_objects=custom_objects,
    233         registered_model_name=registered_model_name,
    234         signature=signature,
    235         input_example=input_example,
    236         await_registration_for=await_registration_for,
    237         pip_requirements=pip_requirements,
    238         extra_pip_requirements=extra_pip_requirements,
    239         saved_model_kwargs=saved_model_kwargs,
    240         keras_model_kwargs=keras_model_kwargs,
    241         metadata=metadata,
    242     )

File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/mlflow/models/model.py:562, in Model.log(cls, artifact_path, flavor, registered_model_name, await_registration_for, metadata, **kwargs)
    560 run_id = mlflow.tracking.fluent._get_or_start_run().info.run_id
    561 mlflow_model = cls(artifact_path=artifact_path, run_id=run_id, metadata=metadata)
--> 562 flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
    563 mlflow.tracking.fluent.log_artifacts(local_path, mlflow_model.artifact_path)
    564 tracking_uri = _resolve_tracking_uri()

File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/mlflow/tensorflow/__init__.py:385, in save_model(model, path, conda_env, code_paths, mlflow_model, custom_objects, signature, input_example, pip_requirements, extra_pip_requirements, saved_model_kwargs, keras_model_kwargs, metadata)
    383 for field in signature.inputs.inputs:
    384     if not isinstance(field, TensorSpec):
--> 385         raise MlflowException(
    386             "All fields in the model signature's input schema must be of type TensorSpec.",
    387             error_code=INVALID_PARAMETER_VALUE,
    388         )
    389     if field.shape[0] != -1:
    390         raise MlflowException(
    391             "All fields in the model signature's input schema must have a shape "
    392             "in which the first dimension is a variable dimension.",
    393             error_code=INVALID_PARAMETER_VALUE,
    394         )

MlflowException: All fields in the model signature's input schema must be of type TensorSpec.